import requests
import tldextract
from bs4 import BeautifulSoup
from common.config import config_option


def get_url():
    """Read the list of target domains from data/domain.txt.

    The first line is treated as a header and skipped; each remaining line
    is tab-separated with the domain in the first column.
    """
    with open(config_option['project_path'] + "/data/domain.txt", encoding="utf-8") as r_file:
        url_list = [line.strip().split("\t")[0] for line in r_file.readlines()[1:]]
    return url_list
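

# Hedged sketch: tldextract (imported above) can collapse entries such as
# "a.b.example.com" to their registered domain. Whether the input list needs
# this normalization is an assumption; crawl_html() does not call this helper.
def normalize_domain(raw_domain):
    """Return the registered domain of raw_domain, e.g. 'a.b.example.com' -> 'example.com'."""
    ext = tldextract.extract(raw_domain)
    # registered_domain is '' when no known suffix matches; fall back to the input
    return ext.registered_domain or raw_domain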


def crawl_html(url_list):
    """
    Fetch each page over HTTPS.
        result.status_code: HTTP status of the request
        result.encoding: detected page encoding
        result.text: page content
    """
    content_list = []
    for url in url_list:
        try:
            print("*" * 20)
            print("URL:", url)
            # A timeout keeps one unresponsive host from stalling the whole run.
            result = requests.get("https://" + url, timeout=10)

            bs = BeautifulSoup(result.text, 'html.parser')
            # Pages without a <title> tag would otherwise raise AttributeError here;
            # strip() removes surrounding whitespace that would break the TSV output.
            title_tag = bs.find('title')
            title = title_tag.text.strip() if title_tag else 'NULL'
            print("Page title:", title)
            # script_list = bs.find_all('script')

            content_list.append((url, str(result.status_code), title))
        except Exception as e:
            print(e)
            content_list.append((url, 'NULL', 'NULL'))

    return content_list
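

# Quick manual check (illustrative): crawl_html(["example.com"]) should
# return [('example.com', '200', 'Example Domain')] when the host is reachable;
# unreachable hosts yield (url, 'NULL', 'NULL') instead.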


def save_content(file_path, content_list):
    """Write one tab-separated row per result: url, status_code, title."""
    with open(file_path, "w", encoding="utf-8") as w_file:
        for content in content_list:
            w_file.write("\t".join(content) + "\n")


def run():
    """Run the pipeline: read domains, crawl them, save the results."""
    url_list = get_url()
    content_list = crawl_html(url_list)
    save_content(config_option['project_path'] + "/output/domain_result.txt", content_list)


if __name__ == '__main__':
    run()
